fastq filesThis step is performed on the O2 cluster. The fastq file quality was checked using FastQC and MultiQC. They are aligned to Ensembl mm10 genome and counted using Salmon pseudoaligner. Output sf files were transfered from O2 to local machine for further processing in R.
suppressMessages(
c(library(DESeq2),
library(limma),
library(tximport),
library(gridExtra),
library(ensembldb),
library(EnsDb.Mmusculus.v79),
library(grid),
library(ggplot2),
library(lattice),
library(reshape),
library(mixOmics),
library(gplots),
library(RColorBrewer),
library(readr),
library(dplyr),
library(VennDiagram),
library(clusterProfiler),
library(DOSE),
library(org.Mm.eg.db),
library(pathview),
library(AnnotationDbi))
)
## Warning: package 'ensembldb' was built under R version 3.5.3
## Warning: package 'GenomicFeatures' was built under R version 3.5.3
## [1] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [4] "BiocParallel" "matrixStats" "Biobase"
## [7] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [10] "S4Vectors" "BiocGenerics" "parallel"
## [13] "stats4" "stats" "graphics"
## [16] "grDevices" "utils" "datasets"
## [19] "methods" "base" "limma"
## [22] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [25] "BiocParallel" "matrixStats" "Biobase"
## [28] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [31] "S4Vectors" "BiocGenerics" "parallel"
## [34] "stats4" "stats" "graphics"
## [37] "grDevices" "utils" "datasets"
## [40] "methods" "base" "tximport"
## [43] "limma" "DESeq2" "SummarizedExperiment"
## [46] "DelayedArray" "BiocParallel" "matrixStats"
## [49] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [52] "IRanges" "S4Vectors" "BiocGenerics"
## [55] "parallel" "stats4" "stats"
## [58] "graphics" "grDevices" "utils"
## [61] "datasets" "methods" "base"
## [64] "gridExtra" "tximport" "limma"
## [67] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [70] "BiocParallel" "matrixStats" "Biobase"
## [73] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [76] "S4Vectors" "BiocGenerics" "parallel"
## [79] "stats4" "stats" "graphics"
## [82] "grDevices" "utils" "datasets"
## [85] "methods" "base" "ensembldb"
## [88] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [91] "gridExtra" "tximport" "limma"
## [94] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [97] "BiocParallel" "matrixStats" "Biobase"
## [100] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [103] "S4Vectors" "BiocGenerics" "parallel"
## [106] "stats4" "stats" "graphics"
## [109] "grDevices" "utils" "datasets"
## [112] "methods" "base" "EnsDb.Mmusculus.v79"
## [115] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [118] "AnnotationDbi" "gridExtra" "tximport"
## [121] "limma" "DESeq2" "SummarizedExperiment"
## [124] "DelayedArray" "BiocParallel" "matrixStats"
## [127] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [130] "IRanges" "S4Vectors" "BiocGenerics"
## [133] "parallel" "stats4" "stats"
## [136] "graphics" "grDevices" "utils"
## [139] "datasets" "methods" "base"
## [142] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [145] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [148] "gridExtra" "tximport" "limma"
## [151] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [154] "BiocParallel" "matrixStats" "Biobase"
## [157] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [160] "S4Vectors" "BiocGenerics" "parallel"
## [163] "stats4" "stats" "graphics"
## [166] "grDevices" "utils" "datasets"
## [169] "methods" "base" "ggplot2"
## [172] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [175] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [178] "gridExtra" "tximport" "limma"
## [181] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [184] "BiocParallel" "matrixStats" "Biobase"
## [187] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [190] "S4Vectors" "BiocGenerics" "parallel"
## [193] "stats4" "stats" "graphics"
## [196] "grDevices" "utils" "datasets"
## [199] "methods" "base" "lattice"
## [202] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [205] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [208] "AnnotationDbi" "gridExtra" "tximport"
## [211] "limma" "DESeq2" "SummarizedExperiment"
## [214] "DelayedArray" "BiocParallel" "matrixStats"
## [217] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [220] "IRanges" "S4Vectors" "BiocGenerics"
## [223] "parallel" "stats4" "stats"
## [226] "graphics" "grDevices" "utils"
## [229] "datasets" "methods" "base"
## [232] "reshape" "lattice" "ggplot2"
## [235] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [238] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [241] "gridExtra" "tximport" "limma"
## [244] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [247] "BiocParallel" "matrixStats" "Biobase"
## [250] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [253] "S4Vectors" "BiocGenerics" "parallel"
## [256] "stats4" "stats" "graphics"
## [259] "grDevices" "utils" "datasets"
## [262] "methods" "base" "mixOmics"
## [265] "MASS" "reshape" "lattice"
## [268] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [271] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [274] "AnnotationDbi" "gridExtra" "tximport"
## [277] "limma" "DESeq2" "SummarizedExperiment"
## [280] "DelayedArray" "BiocParallel" "matrixStats"
## [283] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [286] "IRanges" "S4Vectors" "BiocGenerics"
## [289] "parallel" "stats4" "stats"
## [292] "graphics" "grDevices" "utils"
## [295] "datasets" "methods" "base"
## [298] "gplots" "mixOmics" "MASS"
## [301] "reshape" "lattice" "ggplot2"
## [304] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [307] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [310] "gridExtra" "tximport" "limma"
## [313] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [316] "BiocParallel" "matrixStats" "Biobase"
## [319] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [322] "S4Vectors" "BiocGenerics" "parallel"
## [325] "stats4" "stats" "graphics"
## [328] "grDevices" "utils" "datasets"
## [331] "methods" "base" "RColorBrewer"
## [334] "gplots" "mixOmics" "MASS"
## [337] "reshape" "lattice" "ggplot2"
## [340] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [343] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [346] "gridExtra" "tximport" "limma"
## [349] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [352] "BiocParallel" "matrixStats" "Biobase"
## [355] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [358] "S4Vectors" "BiocGenerics" "parallel"
## [361] "stats4" "stats" "graphics"
## [364] "grDevices" "utils" "datasets"
## [367] "methods" "base" "readr"
## [370] "RColorBrewer" "gplots" "mixOmics"
## [373] "MASS" "reshape" "lattice"
## [376] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [379] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [382] "AnnotationDbi" "gridExtra" "tximport"
## [385] "limma" "DESeq2" "SummarizedExperiment"
## [388] "DelayedArray" "BiocParallel" "matrixStats"
## [391] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [394] "IRanges" "S4Vectors" "BiocGenerics"
## [397] "parallel" "stats4" "stats"
## [400] "graphics" "grDevices" "utils"
## [403] "datasets" "methods" "base"
## [406] "dplyr" "readr" "RColorBrewer"
## [409] "gplots" "mixOmics" "MASS"
## [412] "reshape" "lattice" "ggplot2"
## [415] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [418] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [421] "gridExtra" "tximport" "limma"
## [424] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [427] "BiocParallel" "matrixStats" "Biobase"
## [430] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [433] "S4Vectors" "BiocGenerics" "parallel"
## [436] "stats4" "stats" "graphics"
## [439] "grDevices" "utils" "datasets"
## [442] "methods" "base" "VennDiagram"
## [445] "futile.logger" "dplyr" "readr"
## [448] "RColorBrewer" "gplots" "mixOmics"
## [451] "MASS" "reshape" "lattice"
## [454] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [457] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [460] "AnnotationDbi" "gridExtra" "tximport"
## [463] "limma" "DESeq2" "SummarizedExperiment"
## [466] "DelayedArray" "BiocParallel" "matrixStats"
## [469] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [472] "IRanges" "S4Vectors" "BiocGenerics"
## [475] "parallel" "stats4" "stats"
## [478] "graphics" "grDevices" "utils"
## [481] "datasets" "methods" "base"
## [484] "clusterProfiler" "VennDiagram" "futile.logger"
## [487] "dplyr" "readr" "RColorBrewer"
## [490] "gplots" "mixOmics" "MASS"
## [493] "reshape" "lattice" "ggplot2"
## [496] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [499] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [502] "gridExtra" "tximport" "limma"
## [505] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [508] "BiocParallel" "matrixStats" "Biobase"
## [511] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [514] "S4Vectors" "BiocGenerics" "parallel"
## [517] "stats4" "stats" "graphics"
## [520] "grDevices" "utils" "datasets"
## [523] "methods" "base" "DOSE"
## [526] "clusterProfiler" "VennDiagram" "futile.logger"
## [529] "dplyr" "readr" "RColorBrewer"
## [532] "gplots" "mixOmics" "MASS"
## [535] "reshape" "lattice" "ggplot2"
## [538] "grid" "EnsDb.Mmusculus.v79" "ensembldb"
## [541] "AnnotationFilter" "GenomicFeatures" "AnnotationDbi"
## [544] "gridExtra" "tximport" "limma"
## [547] "DESeq2" "SummarizedExperiment" "DelayedArray"
## [550] "BiocParallel" "matrixStats" "Biobase"
## [553] "GenomicRanges" "GenomeInfoDb" "IRanges"
## [556] "S4Vectors" "BiocGenerics" "parallel"
## [559] "stats4" "stats" "graphics"
## [562] "grDevices" "utils" "datasets"
## [565] "methods" "base" "org.Mm.eg.db"
## [568] "DOSE" "clusterProfiler" "VennDiagram"
## [571] "futile.logger" "dplyr" "readr"
## [574] "RColorBrewer" "gplots" "mixOmics"
## [577] "MASS" "reshape" "lattice"
## [580] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [583] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [586] "AnnotationDbi" "gridExtra" "tximport"
## [589] "limma" "DESeq2" "SummarizedExperiment"
## [592] "DelayedArray" "BiocParallel" "matrixStats"
## [595] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [598] "IRanges" "S4Vectors" "BiocGenerics"
## [601] "parallel" "stats4" "stats"
## [604] "graphics" "grDevices" "utils"
## [607] "datasets" "methods" "base"
## [610] "pathview" "org.Hs.eg.db" "org.Mm.eg.db"
## [613] "DOSE" "clusterProfiler" "VennDiagram"
## [616] "futile.logger" "dplyr" "readr"
## [619] "RColorBrewer" "gplots" "mixOmics"
## [622] "MASS" "reshape" "lattice"
## [625] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [628] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [631] "AnnotationDbi" "gridExtra" "tximport"
## [634] "limma" "DESeq2" "SummarizedExperiment"
## [637] "DelayedArray" "BiocParallel" "matrixStats"
## [640] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [643] "IRanges" "S4Vectors" "BiocGenerics"
## [646] "parallel" "stats4" "stats"
## [649] "graphics" "grDevices" "utils"
## [652] "datasets" "methods" "base"
## [655] "pathview" "org.Hs.eg.db" "org.Mm.eg.db"
## [658] "DOSE" "clusterProfiler" "VennDiagram"
## [661] "futile.logger" "dplyr" "readr"
## [664] "RColorBrewer" "gplots" "mixOmics"
## [667] "MASS" "reshape" "lattice"
## [670] "ggplot2" "grid" "EnsDb.Mmusculus.v79"
## [673] "ensembldb" "AnnotationFilter" "GenomicFeatures"
## [676] "AnnotationDbi" "gridExtra" "tximport"
## [679] "limma" "DESeq2" "SummarizedExperiment"
## [682] "DelayedArray" "BiocParallel" "matrixStats"
## [685] "Biobase" "GenomicRanges" "GenomeInfoDb"
## [688] "IRanges" "S4Vectors" "BiocGenerics"
## [691] "parallel" "stats4" "stats"
## [694] "graphics" "grDevices" "utils"
## [697] "datasets" "methods" "base"
Set working directory to the folder that contains only gene count txt files
# Generate a tx2gene object for matrix generation
edb <- EnsDb.Mmusculus.v79
transcriptsID <- as.data.frame(transcripts(edb))
tx2gene <- as.data.frame(cbind(transcriptsID$tx_id, transcriptsID$tx_id))
# Generate DESeqData using the counting result generated using Salmon
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon tumor/4-OHT enema model/Gene Counts")
inDir = getwd()
countFiles = list.files(inDir, pattern=".sf$", full.names = TRUE)
basename(countFiles)
## [1] "AV1_LIB038588_GEN00143059_R1.sf" "AV2_LIB038588_GEN00143060_R1.sf"
## [3] "AV3_LIB038588_GEN00143061_R1.sf" "AV4_LIB038588_GEN00143062_R1.sf"
## [5] "AV5_LIB041682_GEN00156282_R1.sf" "AV6_LIB041682_GEN00156283_R1.sf"
## [7] "AV7_LIB041682_GEN00156284_R1.sf" "AV8_LIB041682_GEN00156285_R1.sf"
## [9] "AVK1_LIB038588_GEN00143063_R1.sf" "AVK2_LIB038588_GEN00143064_R1.sf"
## [11] "AVK3_LIB038588_GEN00143065_R1.sf" "AVK4_LIB038588_GEN00143066_R1.sf"
## [13] "AVK5_LIB041682_GEN00156286_R1.sf" "AVK6_LIB041682_GEN00156287_R1.sf"
## [15] "AVK7_LIB041682_GEN00156288_R1.sf" "AVK8_LIB041682_GEN00156289_R1.sf"
names(countFiles) <- c('KrasWT_1','KrasWT_2','KrasWT_3','KrasWT_4','KrasWT_5','KrasWT_6','KrasWT_7','KrasWT_8','KrasG12D_1','KrasG12D_2','KrasG12D_3','KrasG12D_4','KrasG12D_5','KrasG12D_6','KrasG12D_7','KrasG12D_8')
txi.salmon <- tximport(countFiles, type = "salmon", tx2gene = tx2gene, ignoreTxVersion = TRUE)
## reading in files with read_tsv
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## transcripts missing from tx2gene: 27668
## summarizing abundance
## summarizing counts
## summarizing length
DESeqsampletable <- data.frame(condition = c('control','control','control','control','control','control','control','control','experimental','experimental','experimental','experimental','experimental','experimental','experimental','experimental'))
DESeqsampletable$batch <- factor(c('1','1','1','1','2','2','2','2','1','1','1','1','2','2','2','2'))
DESeqsampletable$gender <- factor(c('F', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'M', 'F', 'M'))
rownames(DESeqsampletable) <- colnames(txi.salmon$counts)
ddsHTSeq<- DESeqDataSetFromTximport(txi.salmon, DESeqsampletable, ~condition + batch + gender)
## using counts and average transcript lengths from tximport
ddsHTSeq_norm <- estimateSizeFactors(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
ddsHTSeq_norm <- DESeq(ddsHTSeq_norm)
## using pre-existing normalization factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
ddsHTSeq_analysis <- results(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"))
ddsHTSeq_analysis <- lfcShrink(ddsHTSeq_norm, contrast = c("condition", "experimental", "control"), res = ddsHTSeq_analysis)
## using 'normal' for LFC shrinkage, the Normal prior from Love et al (2014).
## additional priors are available via the 'type' argument, see ?lfcShrink for details
MA plot was generated to inspect the correct shrinkage of LFC.
DESeq2::plotMA(ddsHTSeq_analysis)
Data is transformed and pseudocount is calculated.
rawCountTable <- as.data.frame(counts(ddsHTSeq_norm, normalized = TRUE))
pseudoCount = log2(rawCountTable + 1)
grid.arrange(
ggplot(pseudoCount, aes(x= KrasWT_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_1"),
ggplot(pseudoCount, aes(x= KrasWT_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_2"),
ggplot(pseudoCount, aes(x= KrasWT_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_3"), ggplot(pseudoCount, aes(x= KrasWT_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_4"), ggplot(pseudoCount, aes(x= KrasWT_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_5"),
ggplot(pseudoCount, aes(x= KrasWT_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_6"),
ggplot(pseudoCount, aes(x= KrasWT_7)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_7"), ggplot(pseudoCount, aes(x= KrasWT_8)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasWT_8"), nrow = 2)
grid.arrange(
ggplot(pseudoCount, aes(x= KrasG12D_1)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_1"),
ggplot(pseudoCount, aes(x= KrasG12D_2)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_2"),
ggplot(pseudoCount, aes(x= KrasG12D_3)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_3"),
ggplot(pseudoCount, aes(x= KrasG12D_4)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_4"),
ggplot(pseudoCount, aes(x= KrasG12D_5)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_5"),
ggplot(pseudoCount, aes(x= KrasG12D_6)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_6"),
ggplot(pseudoCount, aes(x= KrasG12D_7)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_7"),
ggplot(pseudoCount, aes(x= KrasG12D_8)) + xlab(expression(log[2](count + 1))) + ylab("Number of Genes") +
geom_histogram(colour = "white", fill = "#525252", binwidth = 0.6) + labs(title = "KrasG12D_8"), nrow = 2)
Check on the gene count distribution across all genes.
#Boxplots
suppressMessages(df <- melt(pseudoCount, variable_name = "Samples"))
df <- data.frame(df, Condition = substr(df$Samples,1,7))
ggplot(df, aes(x=Samples, y=value, fill = Condition)) + geom_boxplot() + xlab("") +
ylab(expression(log[2](count+1))) + scale_fill_manual(values = c("#619CFF", "#F564E3")) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
#Histograms and density plots
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) + ylim(c(0, 0.25)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab(expression(log[2](count+1)))
MA plots are used to check for imbalance in sequencing depth between samples of the same condition. I did not generate MA plot for all library pairs. But the example pairs I selected show that there are imbalance in sequencing depth, but the imbalance is quite random and this is common in RNA-Seq datasets.
## WT1 vs WT2
x = pseudoCount[, 1]
y = pseudoCount[, 2]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasWT_1 vs KrasWT_2"))
## WT1 vs WT3
x = pseudoCount[, 1]
y = pseudoCount[, 3]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasWT_1 vs KrasWT_3"))
## WT1 vs WT5
x = pseudoCount[, 1]
y = pseudoCount[, 5]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasWT_1 vs KrasWT_5"))
## WT1 vs WT6
x = pseudoCount[, 1]
y = pseudoCount[, 6]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasWT_1 vs KrasWT_6"))
## G12D_1 vs G12D_2
x = pseudoCount[, 9]
y = pseudoCount[, 10]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasG12D_1 vs KrasG12D_2"))
## G12D_1 vs G12D_3
x = pseudoCount[, 9]
y = pseudoCount[, 11]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasG12D_1 vs KrasG12D_3"))
## G12D_1 vs G12D_5
x = pseudoCount[, 9]
y = pseudoCount[, 13]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasG12D_1 vs KrasG12D_5"))
## G12D_1 vs G12D_6
x = pseudoCount[, 9]
y = pseudoCount[, 14]
## M-values
M = x - y
## A-values
A = (x + y)/2
df = data.frame(A, M)
suppressWarnings(
ggplot(df, aes(x = A, y = M)) + geom_point(size = 1.5, alpha = 1/5) +
geom_hline(color = "blue3", yintercept = 0) + stat_smooth(se = FALSE, method = "loess", color = "red3") + labs(title = "KrasG12D_1 vs KrasG12D_6"))
This is the sanity check step to confirm that under a un-supervised clustering, WT and G12D samples will cluster together. For some reason, the code is giving error when try to plot this heatmap in RStudio, so I created a pdf file that contains the heatmap in the Analysis folder named Hierchical Clustering.pdf
ddsHTSeq_transform <- varianceStabilizingTransformation(ddsHTSeq)
## using 'avgTxLength' from assays(dds), correcting for library size
assay(ddsHTSeq_transform) <- limma::removeBatchEffect(assay(ddsHTSeq_transform), ddsHTSeq_transform$batch)
rawCountTable_transform <- as.data.frame(assay(ddsHTSeq_transform))
pseudoCount_transform = log2(rawCountTable_transform + 1)
mat.dist = pseudoCount_transform
mat.dist = as.matrix(dist(t(mat.dist)))
mat.dist = mat.dist/max(mat.dist)
setwd("/Users/mizuhi/OneDrive - Harvard University/Haigis Lab/Projects/Halo-Ago2/Halo-Ago-KRas/Raw Data/RNA-Seq/Mouse colon tumor/4-OHT enema model/Analysis")
png('Hierchical_Clustering_transcript.png')
cim(mat.dist, symkey = FALSE, margins = c(6, 6))
suppressMessages(dev.off())
## quartz_off_screen
## 2
Final output is following:
I performed PCA analysis on all datasets to confirm that samples from the same condition group together. This step has to be performed using varianceStabelizingTransformed dataset, so that the high variance in genes with low counts will not skew the data.
The top 500 most variable genes are selected for PCA analysis.
plotPCA(ddsHTSeq_transform, intgroup = "condition", ntop = 500) +
geom_text(aes(label=name), vjust = 2, hjust = -0.1) +
xlim(-30,40) + ylim(-30,20)
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_text).
This step removes all genes with 0 counts across all samples, output a csv file and also generate a density plot using filtered dataset.
keep <- rowSums(rawCountTable) > 0
filterCount <- pseudoCount[keep,]
df <- melt(filterCount, variable_name = "Samples")
## Using as id variables
df <- data.frame(df, Condition = substr(df$Samples,1,7))
detected_raw_count_norm <- rawCountTable[keep,]
write.csv(detected_raw_count_norm, "normalized_raw_transcript_counts_filtered.csv")
ggplot(df, aes(x=value, colour = Samples, fill = Samples)) +
geom_density(alpha = 0.2, size = 1.25) + facet_wrap(~ Condition) +
theme(legend.position = "top") + xlab("pseudocounts")
This step generates the analysis file that contains results from differential analysis.
write.csv(as.data.frame(ddsHTSeq_analysis[keep,]), "Differential Analysis_transcript.csv")
Genes that were not detected were removed from the list. Genes with padj < 0.05 were considered significantly dysregulated. Their normalized counts were z-scored and used for plotting the heatmap.
suppressMessages(library(mosaic))
rawCountTable_transform_detected <- rawCountTable_transform[keep,]
dif_analysis <- as.data.frame(ddsHTSeq_analysis)[keep,]
sig_dif <- subset(dif_analysis, dif_analysis$padj < 0.05)
sig_index <- c()
for (i in 1:dim(sig_dif)[1]) {
sig_index <- c(sig_index ,grep(rownames(sig_dif)[i], rownames(rawCountTable_transform_detected)))
}
sig_count <- rawCountTable_transform_detected[sig_index,]
sig_dif <- cbind(sig_dif, sig_count)
for (i in 1:dim(sig_dif)[1]) {
sig_dif[i,7:22] <- zscore(as.numeric(sig_dif[i,7:22]))
}
my_palette <- colorRampPalette(c("blue", "white", "red"))(256)
heatmap_matrix <- as.matrix(sig_dif[,7:22])
png('G12D vs WT 4OHT enema colon tumor RNASeq_transcript.png',
width = 300,
height = 600,
res = 100,
pointsize = 8)
heatmap.2(heatmap_matrix,
main = "G12D vs WT\n4OHT enema\nColon tumor RNASeq transcript",
density.info = "none",
key = TRUE,
lhei = c(1,7),
col=my_palette,
cexCol = 1,
margins = c(8,2),
trace = "none",
dendrogram = "both",
labRow = FALSE,
keysize = 2,
ylab = "Genes",
Colv = "NA")
dev.off()
## quartz_off_screen
## 2
Final output is
# Scatter plot
detected_pseudocount <- pseudoCount[keep,]
dif_analysis$KrasG12D_mean <- rowMeans(detected_pseudocount[,9:16])
dif_analysis$KrasWT_mean <- rowMeans(detected_pseudocount[,1:8])
ggplot(dif_analysis, aes(x = KrasWT_mean, y = KrasG12D_mean)) +
xlab("WT_Average(log2)") + ylab("G12D_Average(log2)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "G12D vs WT Scatter Plot")
# MA plot
ggplot(dif_analysis, aes(x = log(baseMean,2), y = log2FoldChange,)) +
xlab("Average Expression") + ylab("LFC") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "grey") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "WT vs G12D MA Plot")
# Volcano Plot
ggplot(dif_analysis, aes(x = log2FoldChange, y = -log(padj,10))) +
xlab("LFC") + ylab("-log10(P value)") +
geom_point(data = dif_analysis, alpha = 0.5, size = 1, color = "black") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange > 0), alpha = 0.5, size = 1, color = "red") +
geom_point(data = subset(dif_analysis, padj < 0.05 & log2FoldChange < 0), alpha = 0.5, size = 1, color = "blue") +
labs(title = "WT vs G12D Volcano Plot") +
xlim(-3,3) +
ylim(0,20)
## Warning: Removed 5302 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).